rm(list=ls())

source('param_general.R')

args <- commandArgs(trailingOnly = TRUE)
fileTyp <- args[1]

useDatFinal <- readRDS(paste(datDir,'ld_women_process_',fileTyp,'.rds', sep = ''))
travel <- data.table(read.dta(travelFile))

##Include hospitals with over 10 admissions in the data for a given year
hosp <- unique(useDatFinal[,list(admits=.N),by=list(hosp_id,yearDat)],by=c("hosp_id","yearDat"))
hosp <- subset(hosp,admits>=10,select=c(-admits))

pat <- copy(useDatFinal)
outsideOption <- data.table(yearDat = unique(pat$yearDat),hosp_id = 0)

##Lag patient variables
setkey(pat, maskssn,yearDat)
pat[,':='(mult_admit=.N,birthNum = .I - min(.I) + 1L),by=list(maskssn)]
pat[mult_admit>1,':='(hosp_id_prev=c(NA,head(hosp_id,-1)),
                      pat_zip_prev=c(NA,head(pat_zip,-1)),
                      atten_phyid_prev=c(NA,head(atten_phyid,-1))),
    by=list(maskssn)]

message(paste0('Number of admissions at beginning = ', nrow(pat)))

##All normal deliveries in past (including this one) (MSDRG 775 and 373 are codes for vaginal delivery)
setkey(pat,maskssn,birthNum)
pat[,normalLD_hist := (cumsum(normalLD_indic) == birthNum), by = list(maskssn)]
pat[,normalVaginalLD_hist := (cumsum(normalLD_indic * (msdrg == 775 | msdrg == 373)) == birthNum), by = list(maskssn)]
pat[,normalCSLD_hist := (cumsum(normalLD_indic * (msdrg == 766 | msdrg == 371)) == birthNum), by = list(maskssn)]
pat[,age06 := as.numeric((as.Date('2006-01-01') - bday))/365]


##Create choice sets for each person
travelBind <- rbind(travel,data.table(pat_zip=unique(travel$pat_zip),distance=NA,time=NA,hosp_id=0,Hospital = NA))
hospBind <- rbind(hosp,outsideOption,use.names=TRUE)

##Merge drive times to patients
setkey(pat,pat_zip)
setkey(travelBind,pat_zip)
choicesMult <- merge(pat,travelBind[time<=timeMkt|hosp_id==0],allow.cartesian=TRUE,all.x=TRUE)

##Post processing
setnames(choicesMult,c("hosp_id.x","hosp_id.y"),c("hosp_id_chosen","hosp_id_alt"))
choicesMult[,obs_choice:=hosp_id_chosen==hosp_id_alt]

##Collect missing zip codes
missZip <- choicesMult[is.na(hosp_id_alt),.N,by=list(pat_zip)]## List missing zip codes and observations (presumably no route could be calculated from these places)
message('Admissions dropped due to no route from zip = ', sum(missZip$N))
choicesMult <- choicesMult[!pat_zip%in%missZip$pat_zip] ##Drop missing zips


##Remove hospitals from choice set for a zip if less than minZipPpl go there 
choicesMult[,admitHospZip := sum(obs_choice & age06 <= 21,na.rm = TRUE), by = list(hosp_id_alt, pat_zip)]
choicesMult[, inChoiceSet := (admitHospZip >= minZipHospPpl) | hosp_id_alt == 0]

## Sets to outside option if not within set of hospitals
choicesMult[inChoiceSet == TRUE,actHosp:=sum(obs_choice),by=list(admission)]
choicesMult[(actHosp==0 | is.na(actHosp)) & hosp_id_alt==0,obs_choice:=TRUE]

##Merge hospitals in choice set
setnames(hospBind,'hosp_id','hosp_id_alt')
choicesMult[,hosp_id_alt := as.numeric(hosp_id_alt)]
hospBind[,hosp_id_alt := as.numeric(hosp_id_alt)]
setkey(choicesMult,yearDat,hosp_id_alt)
setkey(hospBind,yearDat,hosp_id_alt)
choicesMult <- merge(choicesMult,hospBind,allow.cartesian=TRUE)

choicesMult[inChoiceSet == TRUE ,nHospChoose := .N, by = 'admission']

setnames(choicesMult,c("distance","time"),c("distance_current","time_current"))

##Define lag dependent variable
choicesMult[,chosenPrev := (hosp_id_prev == hosp_id_alt) & hosp_id_alt != 0]

##Select variables used in estimation
estimDat <- choicesMult[inChoiceSet == TRUE,
                        list(admission, yearDat, adate, age,pat_zip, pat_county, 
                             hispanic, 
                             normalLD_indic, normalLD_hist,normalCSLD_hist, normalVaginalLD_hist, birthNum, 
                             maskssn, age06,
                             atten_phyid, oper_phyid, 
                             time_current, hosp_id_alt, obs_choice, mult_admit, payer,
                             chosenPrev, hosp_id_prev, pat_zip_prev, atten_phyid_prev,
                             msdrg,
                             admitHospZip,nHospChoose)]

if(estimDat[,sum(obs_choice)] != nrow(pat) - sum(missZip$N)) stop('Patients inadvertantly added or dropped')

##Frequency of attending doctor in each hospital in each year. 
physHospFreq <- estimDat[obs_choice == 1 & hosp_id_alt != 0, list(birthDocHosp = .N),by = list(hosp_id_alt,atten_phyid,yearDat)]


##Merge doctor data to dataset
setkey(physHospFreq, yearDat, atten_phyid, hosp_id_alt)
setkey(estimDat, yearDat, atten_phyid, hosp_id_alt)
estimDat <- merge(estimDat,physHospFreq,all.x = TRUE)

if(length(unique(estimDat$admission))!=sum(estimDat$obs_choice)) stop('Choices must be same as admissions')

##Define assorted variables
setkey(estimDat, maskssn, hosp_id_alt)
##First birth hospital
estimDat[mult_admit > 1,chosenFirstHospTemp := (obs_choice == TRUE & birthNum == 1)]
##Second birth hospital
estimDat[mult_admit > 1,chosenSecHospTemp := (obs_choice==TRUE & birthNum == 2)]

##Take all variables from above and make constant at patient level
varNames <- c('chosenFirstHosp','chosenSecHosp')
varNamesTemp <- paste0(varNames, 'Temp')
for (nm in varNames){
    estimDat[mult_admit > 1, (nm) := as.integer(max(get(paste0(nm,'Temp')), na.rm=TRUE)),by = list(maskssn,hosp_id_alt)]
}


##Same doctor for first and second birth
estimDat[birthNum == 2 & mult_admit > 1,sameDocFirst2Temp := as.integer(atten_phyid == atten_phyid_prev)]
##Same doctor for second and third birth
estimDat[birthNum == 3 & mult_admit > 1, sameDocTwoThreeTemp := as.integer(atten_phyid==atten_phyid_prev)]
##Went to first birth hospital for third birth
estimDat[birthNum == 3 & mult_admit > 1,firstHosp3Temp := as.integer(chosenFirstHosp == 1 & obs_choice == TRUE)]
##Went to second birth hospital for third birth
estimDat[birthNum == 3 & mult_admit > 1,secondHosp3Temp := as.integer(chosenSecHosp == 1 & obs_choice == TRUE)]
##Did not move houses b/w first and second birth 
estimDat[birthNum == 2 & mult_admit > 1,noMove2Temp := as.integer(pat_zip == pat_zip_prev)]
##Did not move houses on third birth (needed to identify switching costs)
estimDat[birthNum == 3 & mult_admit > 1,noMove3Temp := as.integer(pat_zip==pat_zip_prev)]
##first births normal
estimDat[birthNum == 2 & mult_admit > 1, first2NormTemp := as.integer(normalLD_hist)]
estimDat[birthNum == 2 & mult_admit > 1, first2NormVaginalTemp := as.integer(normalVaginalLD_hist)]
estimDat[birthNum == 2 & mult_admit > 1, first2NormCSTemp := as.integer(normalCSLD_hist)]
estimDat[birthNum == 3 & mult_admit > 1, first3NormTemp := as.integer(normalLD_hist)]


##Take all variables from above and make constant at patient level
varNames <- c('sameDocFirst2','sameDocTwoThree','firstHosp3','secondHosp3','noMove2','noMove3','first2Norm','first3Norm','first2NormVaginal','first2NormCS')
varNamesTemp <- paste0(varNames, 'Temp')
for (nm in varNames){
    print(nm)
    estimDat[mult_admit > 1, (nm) := as.integer(max(get(paste0(nm,'Temp')), na.rm=TRUE)),by = list(maskssn)]
}

##Delivery doc for first three births
estimDat <- merge(unique(estimDat[birthNum == 1 & mult_admit > 1, list(firstAttenPhys = atten_phyid , maskssn)]),estimDat, by = 'maskssn', all.y = TRUE)
estimDat <- merge(unique(estimDat[birthNum == 2 & mult_admit > 1, list(secondAttenPhys = atten_phyid , maskssn)]),estimDat, by = 'maskssn', all.y = TRUE)
estimDat <- merge(unique(estimDat[birthNum == 3 & mult_admit > 1, list(thirdAttenPhys = atten_phyid , maskssn)]),estimDat, by = 'maskssn', all.y = TRUE)



##Construct set of people that will be used in Chamberlain/Honore Estimation 
##Chamberlin 1980 FE approach (see C+T p798) and Honore + Kyriazidou 2000
setkey(estimDat, maskssn,hosp_id_alt)
estimDat[mult_admit > 1 & birthNum <= 2 & hosp_id_alt != 0, nHospChosen := sum(obs_choice), by=list(maskssn, hosp_id_alt)]  ##Number of hospitals chosen in first two births
estimDat[mult_admit > 1,hospSwitchFirst2 := (sum(nHospChosen == 1, na.rm=TRUE) == 4), by = 'maskssn'] ##People who went to different hospitals for first two births, where both were in choice set both times


##Delete temporary variables
for (nm in grep('Temp',names(estimDat), value = TRUE))  estimDat[,(nm) := NULL] 

estimDatMerge <- copy(estimDat)

##Merge in metro area data
zipMSAXWalk <- data.table(read.dta("/NDrive/SIL-Common/Hospital Files/Geography/zip_msa_xwalk2010.dta"))
setkey(estimDatMerge,pat_zip)
estimDatMerge[zipMSAXWalk[msa_name == 'Jacksonville' & msa_state == 'FL'],jacksonvilleMSA := TRUE]
estimDatMerge[is.na(jacksonvilleMSA),jacksonvilleMSA := FALSE]

##Data modifications to prepare for estimation
estimDatMerge[birthNum == 1, chosenPrev:=0L] ##No switching cost for first time moms
estimDatMerge[hosp_id_alt == 0,':='(time_current = 0., chosenPrev = FALSE)]
estimDatMerge[, ':=' (chosenPrev = as.integer(chosenPrev))]


##Generate dataset to estimate chamberlain and honore models
chambDatMelt <- melt(estimDatMerge[hospSwitchFirst2 == TRUE & 
                                (chosenFirstHosp == 1 | chosenSecHosp == 1) & birthNum <=3,
                              list(maskssn,birthNum,age06, time_current, chosenFirstHosp,noMove3,sameDocFirst2,
                                   sameDocTwoThree,firstHosp3,secondHosp3,first2Norm,first2NormVaginal,first2NormCS,first3Norm, 
                                   pat_zip,hosp_id_alt, commercial = payer == 'Commercial',selfpay = payer == 'SelfCharity',
                                   adate, age,
                                   jacksonvilleMSA,
                                   normalLD = normalLD_indic,
                                   mco = payer == 'MedicaidMCO', ffs = payer == 'Medicaid')],
                     id.vars=c('maskssn','sameDocFirst2','sameDocTwoThree','birthNum', 'age06', 
                         'chosenFirstHosp','noMove3','firstHosp3','secondHosp3','first2Norm','first3Norm','first2NormVaginal','first2NormCS'))

chambDat <- data.table(dcast(chambDatMelt,
                                age06 + maskssn+sameDocFirst2+sameDocTwoThree+noMove3+  firstHosp3+secondHosp3+first2Norm+first2NormVaginal+first2NormCS+first3Norm  ~
                                  variable+birthNum+chosenFirstHosp))##Only look at hospitals that were chosen for first or second birth

message('Patients in chamberlain dataset = ',nrow(chambDat), '\nPatients in estimation dataset = ', length(unique(estimDatMerge$maskssn)),'\nAdmissions in estimation dataset = ',length(unique(estimDatMerge$admission)), '\nAll numbers are before age restrictions imposed')

chambDat[,one := 1]
chambDat[,time_current_diff:=(time_current_2_0-time_current_2_1)-(time_current_1_0-time_current_1_1)]
chambDat[,time_current_diff_log:=(log(time_current_2_0)-log(time_current_2_1))-(log(time_current_1_0 )-log(time_current_1_1))]
chambDat[,time2_current_diff:=(time_current_2_0^2-time_current_2_1^2)-(time_current_1_0^2-time_current_1_1^2)]
chambDat[,':='(time_diff_firstBirthHosp = time_current_2_1 - time_current_1_1,
               time_diff_secBirthHosp = time_current_2_0 - time_current_1_0)]

##Save data
message('saving files')
estimDatPrev <- copy(estimDat)
estimDat <- copy(estimDatMerge)
save(estimDat, file=paste0(datDir,'estimDat_minPpl_',fileTyp,'_',minZipHospPpl,'.RData') )

save(choicesMult, file=paste0(datDir,'allProcessedDat_',fileTyp,'.RData') )
estimDat[oper_phyid == "",oper_phyid := NA]
write.dta(estimDat,paste0(datDir,'estimDat_minPpl_',fileTyp,'_',minZipHospPpl,'.dta'))
save(chambDat, file=paste0(datDir,'chambDat_minPpl_',fileTyp,'_',minZipHospPpl,'.RData'))


